Problem 1

For the gapminder data, perform the following operations, using the tidy::nest() function and and data frames with list-columns:

1. Fit a separate linear model of log10(gdpPercap) on year for each country.

Import gapminder package

library(gapminder)

Generate the nested data by country and continent.

gap_nested <- gapminder %>%
  group_by(country, continent) %>%
  nest()
gap_nested
## # A tibble: 142 × 3
## # Groups:   country, continent [142]
##    country     continent data             
##    <fct>       <fct>     <list>           
##  1 Afghanistan Asia      <tibble [12 × 4]>
##  2 Albania     Europe    <tibble [12 × 4]>
##  3 Algeria     Africa    <tibble [12 × 4]>
##  4 Angola      Africa    <tibble [12 × 4]>
##  5 Argentina   Americas  <tibble [12 × 4]>
##  6 Australia   Oceania   <tibble [12 × 4]>
##  7 Austria     Europe    <tibble [12 × 4]>
##  8 Bahrain     Asia      <tibble [12 × 4]>
##  9 Bangladesh  Asia      <tibble [12 × 4]>
## 10 Belgium     Europe    <tibble [12 × 4]>
## # … with 132 more rows

Fit a linear model on each country

gap_nested_lm <- gap_nested %>%
  mutate(model = map(data, ~lm(log10(gdpPercap) ~ year, data = .)))
gap_nested_lm
## # A tibble: 142 × 4
## # Groups:   country, continent [142]
##    country     continent data              model 
##    <fct>       <fct>     <list>            <list>
##  1 Afghanistan Asia      <tibble [12 × 4]> <lm>  
##  2 Albania     Europe    <tibble [12 × 4]> <lm>  
##  3 Algeria     Africa    <tibble [12 × 4]> <lm>  
##  4 Angola      Africa    <tibble [12 × 4]> <lm>  
##  5 Argentina   Americas  <tibble [12 × 4]> <lm>  
##  6 Australia   Oceania   <tibble [12 × 4]> <lm>  
##  7 Austria     Europe    <tibble [12 × 4]> <lm>  
##  8 Bahrain     Asia      <tibble [12 × 4]> <lm>  
##  9 Bangladesh  Asia      <tibble [12 × 4]> <lm>  
## 10 Belgium     Europe    <tibble [12 × 4]> <lm>  
## # … with 132 more rows

2. Plot residuals against time, showing separate lines for each country in the same plot. Also, do this separately for each continent.

Compute and add the residuals to the tibble

gap_nested_lm_resid <- gap_nested_lm %>%
  mutate(resid = map2(data, model, add_residuals)) %>%
  unnest(resid)
gap_nested_lm_resid
## # A tibble: 1,704 × 9
## # Groups:   country, continent [142]
##    country     continent data     model   year lifeExp    pop gdpPercap    resid
##    <fct>       <fct>     <list>   <list> <int>   <dbl>  <int>     <dbl>    <dbl>
##  1 Afghanistan Asia      <tibble> <lm>    1952    28.8 8.43e6      779. -0.0202 
##  2 Afghanistan Asia      <tibble> <lm>    1957    30.3 9.24e6      821.  0.00433
##  3 Afghanistan Asia      <tibble> <lm>    1962    32.0 1.03e7      853.  0.0231 
##  4 Afghanistan Asia      <tibble> <lm>    1967    34.0 1.15e7      836.  0.0164 
##  5 Afghanistan Asia      <tibble> <lm>    1972    36.1 1.31e7      740. -0.0347 
##  6 Afghanistan Asia      <tibble> <lm>    1977    38.4 1.49e7      786. -0.00641
##  7 Afghanistan Asia      <tibble> <lm>    1982    39.9 1.29e7      978.  0.0905 
##  8 Afghanistan Asia      <tibble> <lm>    1987    40.8 1.39e7      852.  0.0328 
##  9 Afghanistan Asia      <tibble> <lm>    1992    41.7 1.63e7      649. -0.0834 
## 10 Afghanistan Asia      <tibble> <lm>    1997    41.8 2.22e7      635. -0.0909 
## # … with 1,694 more rows

Residuals graph for each country

gap_nested_lm_resid %>%
  ggplot(aes(year, resid)) +
  geom_line(alpha = 1/3, aes(group = country)) +
  geom_smooth(color = "blue") +
  labs(
    title = "Residual Graph for Each Country"
  ) +
  theme(plot.title = element_text(hjust = 0.5))
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

Residuals graph for each continent

gap_nested_lm_resid %>%
  ggplot(aes(year, resid)) +
  geom_line(alpha = 1/3, aes(group = country)) +
  geom_smooth(color = "blue") +
  facet_wrap(~continent) +
  labs(
    title = "Residual Graph for Each Continent"
  ) +
  theme(plot.title = element_text(hjust = 0.5))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

3. Create a continent-wise Beeswarmplot for

(i) value of the estimated slope coefficient

gap_nested_lm_tidy <- gap_nested_lm %>%
  mutate(tidy = map(model, tidy))

gap_nested_lm_tidy %>%
  mutate(tidy = map(tidy, ~filter(., term == "year"))) %>%
  unnest(tidy) %>%
  ggplot(aes(continent, estimate)) +
  geom_boxplot() +
  geom_beeswarm() +
  labs(
    title = "Continent-wise Beeswarmplot for Slope Coefficient Value"
  ) +
  theme(plot.title = element_text(hjust = 0.5))

(ii) value of the t-statistic (ratio of estimate and standard error). [Hint: you may need to revisit the materials on broom package]. Interpret the plots.

gap_nested_lm_tidy %>%
  mutate(tidy = map(tidy, ~filter(., term == "year"))) %>%
  unnest(tidy) %>%
  ggplot(aes(continent, statistic)) +
  geom_boxplot() +
  geom_beeswarm() +
  labs(
    title = "Continent-wise Beeswarmplot for t-statistic Value"
  ) +
  theme(plot.title = element_text(hjust = 0.5))

According to the graph, we can see that

4. Identify the countries that have estimated negative slopes and p-values less than 0.05. What is the interpretation of the linear model fit for these countries?

negativeCountries <- gap_nested_lm_tidy %>%
  mutate(tidy = map(tidy, ~filter(., term == "year"))) %>%
  unnest(tidy) %>%
  filter(estimate < 0 & p.value < 0.05) %>%
  ungroup() %>%
  select(country, estimate, p.value)
negativeCountries
## # A tibble: 8 × 3
##   country                  estimate   p.value
##   <fct>                       <dbl>     <dbl>
## 1 Central African Republic -0.00461 0.0000171
## 2 Congo, Dem. Rep.         -0.0106  0.000139 
## 3 Djibouti                 -0.00354 0.0103   
## 4 Haiti                    -0.00277 0.0160   
## 5 Kuwait                   -0.0105  0.00137  
## 6 Madagascar               -0.00485 0.000337 
## 7 Niger                    -0.00394 0.00591  
## 8 Somalia                  -0.00316 0.00339

5. Plot the year-wise log10(gdpPercap) for the countries identified in step 4.

gapminder %>%
  filter(country %in% negativeCountries$country) %>%
  group_by(country, continent) %>%
  ggplot(aes(x = year, y = log10(gdpPercap), color = country)) +
  geom_line()

Problem 2

In the lecture, we discussed fitting of a linear model of mpg versus wt from the mtcars data and demonstrated evaluation of its out-of-sample performance with a k-fold cross validation. Repeat this analysis for a non-linear model mpg ~ \(\frac{k}{wt}+b\) , where k and b are model parameters and compare its performance with the linear model using an 8-fold cross validation.

Set 8-fold cross validation

mtcars_cv <- mtcars %>%
  crossv_kfold(k = 8)
mtcars_cv
## # A tibble: 8 × 3
##   train                test                .id  
##   <named list>         <named list>        <chr>
## 1 <resample [28 x 11]> <resample [4 x 11]> 1    
## 2 <resample [28 x 11]> <resample [4 x 11]> 2    
## 3 <resample [28 x 11]> <resample [4 x 11]> 3    
## 4 <resample [28 x 11]> <resample [4 x 11]> 4    
## 5 <resample [28 x 11]> <resample [4 x 11]> 5    
## 6 <resample [28 x 11]> <resample [4 x 11]> 6    
## 7 <resample [28 x 11]> <resample [4 x 11]> 7    
## 8 <resample [28 x 11]> <resample [4 x 11]> 8

Define linear model mpg ~ wt

mtcars_lm <- mtcars %>%
  lm(mpg ~ wt, .)
mtcars_lm
## 
## Call:
## lm(formula = mpg ~ wt, data = .)
## 
## Coefficients:
## (Intercept)           wt  
##      37.285       -5.344

Define non-linear model mpg ~ \(\frac{k}{wt} + b\)

mtcars_nls <- mtcars %>%
  nls(mpg ~ k / wt + b, ., start = list(k = 1, b = 0))
mtcars_nls
## Nonlinear regression model
##   model: mpg ~ k/wt + b
##    data: .
##      k      b 
## 45.829  4.386 
##  residual sum-of-squares: 230.9
## 
## Number of iterations to convergence: 1 
## Achieved convergence tolerance: 2.877e-08

Compute RSME for both model

lm_rmse_mean <- mtcars_cv %>%
  mutate(lm = map(train, ~lm(mpg ~ wt, data = .))) %$%
  map2_dbl(lm, test, rmse) %>%
  mean()

nls_rmse_mean <- mtcars_cv %>%
  mutate(
    nls = map(train, ~nls(mpg ~ k/wt+b, data = .[[1]], start = list(k = 1, b = 0)))
  ) %$%
  map2_dbl(nls, test, rmse) %>%
  mean()

print(paste("Linear model RSME mean: ", lm_rmse_mean, sep = ''))
## [1] "Linear model RSME mean: 2.80450015327895"
print(paste('Non-linear model RSME mean: ', nls_rmse_mean, sep = ''))
## [1] "Non-linear model RSME mean: 7.88101654971769"

Comparing the mean RSME of both models, we can see that linear model’s mean RSME is less than non-linear model’s one. This means that the performance of the linear model is better than non-linear model.

Problem 3

1. Download the data frames “Dataset1-Media-Example-NODES.csv” and “Dataset1-Media-Example-EDGES.csv” from https://kateto.net/network-visualization, and create an igraph network object.

Read nodes file and convert to tibble

nodes <- read.csv(
  "Dataset1-Media-Example-NODES.csv",
  header = T,
  as.is = T
) %>%
  as_tibble()
nodes
## # A tibble: 17 × 5
##    id    media               media.type type.label audience.size
##    <chr> <chr>                    <int> <chr>              <int>
##  1 s01   NY Times                     1 Newspaper             20
##  2 s02   Washington Post              1 Newspaper             25
##  3 s03   Wall Street Journal          1 Newspaper             30
##  4 s04   USA Today                    1 Newspaper             32
##  5 s05   LA Times                     1 Newspaper             20
##  6 s06   New York Post                1 Newspaper             50
##  7 s07   CNN                          2 TV                    56
##  8 s08   MSNBC                        2 TV                    34
##  9 s09   FOX News                     2 TV                    60
## 10 s10   ABC                          2 TV                    23
## 11 s11   BBC                          2 TV                    34
## 12 s12   Yahoo News                   3 Online                33
## 13 s13   Google News                  3 Online                23
## 14 s14   Reuters.com                  3 Online                12
## 15 s15   NYTimes.com                  3 Online                24
## 16 s16   WashingtonPost.com           3 Online                28
## 17 s17   AOL.com                      3 Online                33

Read edges files and convert to tibble

links <- read.csv(
  "Dataset1-Media-Example-EDGES.csv",
  header = T,
  as.is = T
) %>%
  as_tibble()
links
## # A tibble: 49 × 4
##    from  to    type      weight
##    <chr> <chr> <chr>      <int>
##  1 s01   s02   hyperlink     22
##  2 s01   s03   hyperlink     22
##  3 s01   s04   hyperlink     21
##  4 s01   s15   mention       20
##  5 s02   s01   hyperlink     23
##  6 s02   s03   hyperlink     21
##  7 s02   s09   hyperlink      1
##  8 s02   s10   hyperlink      5
##  9 s03   s01   hyperlink     21
## 10 s03   s04   hyperlink     22
## # … with 39 more rows

Create the igraph object

net_igraph <- graph_from_data_frame(d = links, vertices = nodes, directed = T)
net_igraph
## IGRAPH a85b62b DNW- 17 49 -- 
## + attr: name (v/c), media (v/c), media.type (v/n), type.label (v/c),
## | audience.size (v/n), type (e/c), weight (e/n)
## + edges from a85b62b (vertex names):
##  [1] s01->s02 s01->s03 s01->s04 s01->s15 s02->s01 s02->s03 s02->s09 s02->s10
##  [9] s03->s01 s03->s04 s03->s05 s03->s08 s03->s10 s03->s11 s03->s12 s04->s03
## [17] s04->s06 s04->s11 s04->s12 s04->s17 s05->s01 s05->s02 s05->s09 s05->s15
## [25] s06->s06 s06->s16 s06->s17 s07->s03 s07->s08 s07->s10 s07->s14 s08->s03
## [33] s08->s07 s08->s09 s09->s10 s10->s03 s12->s06 s12->s13 s12->s14 s13->s12
## [41] s13->s17 s14->s11 s14->s13 s15->s01 s15->s04 s15->s06 s16->s06 s16->s17
## [49] s17->s04

Plot the graph

plot(net_igraph)

2. Convert the igraph to a network object in the sna package

net_network <- asNetwork(net_igraph)
net_network
##  Network attributes:
##   vertices = 17 
##   directed = TRUE 
##   hyper = FALSE 
##   loops = TRUE 
##   multiple = FALSE 
##   bipartite = FALSE 
##   total edges= 49 
##     missing edges= 0 
##     non-missing edges= 49 
## 
##  Vertex attribute names: 
##     audience.size media media.type type.label vertex.names 
## 
##  Edge attribute names: 
##     type weight

3. Plot the network using separate colors for the nodes based on the vertex attribute media.type and make the size of the nodes proportional to the vertex attribute audience.size. [Hint: Use network:;get.vertex.attribute]

net_network %>%
  ggnet2(size = "audience.size", color = "media.type", label = T)

4. Calculate the mean degree and density of the network using appropriate functions in the sna package.

Mean degree

net_network %>%
  degree(gmode = "graph") %>%
  mean()
## [1] 2.823529

Density

net_network %>%
  network.density()
## [1] 0.1695502

Problem 4

Scrape the country-wise population data from https://www.worldometers.info/world-population/population-by-country/. Plot the population density (\(P/Km^2\)) obtained from this table on a country-wise choropleth map. Make sure to

1. Clean the data to make it compatible with the country-wise world choropleth map

Scrape the data

url <- "https://www.worldometers.info/world-population/population-by-country/"
country_population <- url %>%
  read_html() %>%
  html_nodes("table") %>%
  html_table(fill = TRUE) %>%
  .[[1]]
country_population
## # A tibble: 235 × 12
##      `#` `Country (or dependency)` `Population (2…` `Yearly Change` `Net Change`
##    <int> <chr>                     <chr>            <chr>           <chr>       
##  1     1 China                     1,439,323,776    0.39 %          5,540,090   
##  2     2 India                     1,380,004,385    0.99 %          13,586,631  
##  3     3 United States             331,002,651      0.59 %          1,937,734   
##  4     4 Indonesia                 273,523,615      1.07 %          2,898,047   
##  5     5 Pakistan                  220,892,340      2.00 %          4,327,022   
##  6     6 Brazil                    212,559,417      0.72 %          1,509,890   
##  7     7 Nigeria                   206,139,589      2.58 %          5,175,990   
##  8     8 Bangladesh                164,689,383      1.01 %          1,643,222   
##  9     9 Russia                    145,934,462      0.04 %          62,206      
## 10    10 Mexico                    128,932,753      1.06 %          1,357,224   
## # … with 225 more rows, and 7 more variables: `Density (P/Km²)` <chr>,
## #   `Land Area (Km²)` <chr>, `Migrants (net)` <chr>, `Fert. Rate` <chr>,
## #   `Med. Age` <chr>, `Urban Pop %` <chr>, `World Share` <chr>

Clean the data

country_population <- country_population %>%
  rename(
    `Density (P/Km2)` = `Density (P/Km²)`,
    `Land Area (Km2)` = `Land Area (Km²)`
  ) %>%
  mutate(
    `Country (or dependency)` = tolower(`Country (or dependency)`),
    `Population (2020)` = as.integer(
      str_replace_all(.$`Population (2020)`, ",", "")
    ),
    `Yearly Change` = as.double(
      str_replace_all(.$`Yearly Change`, " %", "")
    ),
    `Net Change` = as.integer(
      str_replace_all(.$`Net Change`, ',', '')
    ),
    `Density (P/Km2)` = as.integer(
      str_replace_all(.$`Density (P/Km2)`, ',', '')
    ),
    `Land Area (Km2)` = as.integer(
      str_replace_all(.$`Land Area (Km2)`, ',', '')
    ),
    `Migrants (net)` = as.integer(
      str_replace_all(.$`Migrants (net)`, ',', '')
    ),
    `Fert. Rate` = as.double(`Fert. Rate`),
    `Med. Age` = as.integer(`Med. Age`),
    `Urban Pop %` = as.integer(
      str_replace_all(.$`Urban Pop %`, ' %', '')
    ),
    `World Share` = as.double(
      str_replace_all(.$`World Share`, ' %', '')
    )
  )

country_population
## # A tibble: 235 × 12
##      `#` `Country (or dependency)` `Population (2…` `Yearly Change` `Net Change`
##    <int> <chr>                                <int>           <dbl>        <int>
##  1     1 china                           1439323776            0.39      5540090
##  2     2 india                           1380004385            0.99     13586631
##  3     3 united states                    331002651            0.59      1937734
##  4     4 indonesia                        273523615            1.07      2898047
##  5     5 pakistan                         220892340            2         4327022
##  6     6 brazil                           212559417            0.72      1509890
##  7     7 nigeria                          206139589            2.58      5175990
##  8     8 bangladesh                       164689383            1.01      1643222
##  9     9 russia                           145934462            0.04        62206
## 10    10 mexico                           128932753            1.06      1357224
## # … with 225 more rows, and 7 more variables: `Density (P/Km2)` <int>,
## #   `Land Area (Km2)` <int>, `Migrants (net)` <int>, `Fert. Rate` <dbl>,
## #   `Med. Age` <int>, `Urban Pop %` <int>, `World Share` <dbl>

2. Maximize the overlap between the two data frames (the one obtained from the scraped data and the choropleth country data frame), i.e., if a country appears in both data frames, possibly with different names, it must be plotted

Maximize the overlap

country_population <- country_population %>%
  mutate(
    `Country (or dependency)` = case_when(
      `Country (or dependency)` == "united states" ~ "united states of america",
      `Country (or dependency)` == 'serbia' ~ 'republic of serbia',
      `Country (or dependency)` == 'tanzania' ~ 'united republic of tanzania',
      `Country (or dependency)` == 'north macedonia' ~ 'macedonia',
      `Country (or dependency)` == 'bahamas' ~ 'the bahamas',
      `Country (or dependency)` == 'timor-leste' ~ 'east timor',
      str_detect(`Country (or dependency)`, "ivoire") ~ 'ivory coast',
      `Country (or dependency)` == 'dr congo' ~ 'democratic republic of the congo',
      `Country (or dependency)` == 'congo' ~ 'republic of congo',
      `Country (or dependency)` == 'czech republic (czechia)' ~ 'czech republic',
      `Country (or dependency)` == 'guinea-bissau' ~ 'guinea bissau',
      T ~ `Country (or dependency)`
    )
  )

3. List the countries, if any, in the scraped data frame that do not appear in the choropleth country data frame (after appropriate cleaning)

data(country.regions)
setdiff(
  country.regions$region,
  country_population$`Country (or dependency)`
) %>%
  as_tibble_col(column_name = "region")
## # A tibble: 5 × 1
##   region         
##   <chr>          
## 1 somaliland     
## 2 swaziland      
## 3 northern cyprus
## 4 antarctica     
## 5 kosovo

4. List the countries, if any, in the choropleth country data frame that do not appear in the scraped data frame (after appropriate clearning)

setdiff(
  country_population$`Country (or dependency)`,
  country.regions$region
) %>%
  as_tibble_col(column_name = "region")
## # A tibble: 69 × 1
##    region            
##    <chr>             
##  1 hong kong         
##  2 singapore         
##  3 state of palestine
##  4 puerto rico       
##  5 bahrain           
##  6 mauritius         
##  7 eswatini          
##  8 réunion           
##  9 comoros           
## 10 macao             
## # … with 59 more rows

Plot the map

country_population %>%
  mutate(region = `Country (or dependency)`, value = `Density (P/Km2)`) %>%
  country_choropleth(title = 'World Population Density Map', num_colors = 9) +
  theme(plot.title = element_text(hjust = 0.5))

Problem 5

Obtain 2016-2020 5-year aggregated ACS tract-wise data on NJ median household income and rental. Combine them into a single data frame.

1. Plot the tract-wise rental against the median household income and comment.

Set API key

census_api_key("44ed2f559fc8ff1593dc792e28e54bd655f7f900", install = T, overwrite = T)
## Your original .Renviron will be backed up and stored in your R HOME directory if needed.
## Your API key has been stored in your .Renviron and can be accessed by Sys.getenv("CENSUS_API_KEY"). 
## To use now, restart R or run `readRenviron("~/.Renviron")`
## [1] "44ed2f559fc8ff1593dc792e28e54bd655f7f900"

Obtain the data and combine both income and rental to one tibble

nj_median_household_income_and_rental <- get_acs(
  geography = "tract",
  variables = c(medincome = "B19013_001", medrental = "B25064_001"),
  state = "NJ",
  year = 2020,
  geometry = T
) %>%
  as_tibble() %>%
  pivot_wider(
    names_from = variable,
    values_from = c(estimate, moe)
  )
## Getting data from the 2016-2020 5-year ACS
## Downloading feature geometry from the Census website.  To cache shapefiles for use in future sessions, set `options(tigris_use_cache = TRUE)`.
nj_median_household_income_and_rental
## # A tibble: 2,181 × 7
##    GEOID       NAME                   geometry estimate_medinc… estimate_medren…
##    <chr>       <chr>        <MULTIPOLYGON [°]>            <dbl>            <dbl>
##  1 34021002200 Cens… (((-74.74945 40.22568, -…            54860             1121
##  2 34021004404 Cens… (((-74.50135 40.25484, -…           100282             1685
##  3 34021000600 Cens… (((-74.74294 40.21703, -…            66029             1180
##  4 34021001200 Cens… (((-74.81737 40.24094, -…            49659             1163
##  5 34025805700 Cens… (((-74.01226 40.30434, -…            57823             1199
##  6 34025809703 Cens… (((-74.29578 40.3226, -7…           130705             2262
##  7 34025807004 Cens… (((-74.00926 40.23071, -…            52440             1210
##  8 34025810300 Cens… (((-74.3717 40.29863, -7…            84091             1691
##  9 34025806504 Cens… (((-74.05513 40.25025, -…            72267             1200
## 10 34025802500 Cens… (((-74.23142 40.44554, -…            97232             1136
## # … with 2,171 more rows, and 2 more variables: moe_medincome <dbl>,
## #   moe_medrental <dbl>

Import two library to help draw the graph

library(biscale)
library(cowplot)
## 
## Attaching package: 'cowplot'
## The following object is masked from 'package:lubridate':
## 
##     stamp

Define the bi-class

nj_median_household_income_and_rental <- nj_median_household_income_and_rental %>%
  bi_class(
    x = estimate_medincome,
    y = estimate_medrental,
    style = "quantile",
    dim = 3
  )
nj_median_household_income_and_rental
## # A tibble: 2,181 × 8
##    GEOID       NAME                   geometry estimate_medinc… estimate_medren…
##    <chr>       <chr>        <MULTIPOLYGON [°]>            <dbl>            <dbl>
##  1 34021002200 Cens… (((-74.74945 40.22568, -…            54860             1121
##  2 34021004404 Cens… (((-74.50135 40.25484, -…           100282             1685
##  3 34021000600 Cens… (((-74.74294 40.21703, -…            66029             1180
##  4 34021001200 Cens… (((-74.81737 40.24094, -…            49659             1163
##  5 34025805700 Cens… (((-74.01226 40.30434, -…            57823             1199
##  6 34025809703 Cens… (((-74.29578 40.3226, -7…           130705             2262
##  7 34025807004 Cens… (((-74.00926 40.23071, -…            52440             1210
##  8 34025810300 Cens… (((-74.3717 40.29863, -7…            84091             1691
##  9 34025806504 Cens… (((-74.05513 40.25025, -…            72267             1200
## 10 34025802500 Cens… (((-74.23142 40.44554, -…            97232             1136
## # … with 2,171 more rows, and 3 more variables: moe_medincome <dbl>,
## #   moe_medrental <dbl>, bi_class <chr>

Create the map

map <- nj_median_household_income_and_rental %>%
  ggplot() +
  geom_sf(
    aes(fill = bi_class, geometry = geometry),
    color = "white",
    size = 0.1,
    show.legend = F
  ) + 
  bi_scale_fill(pal = "DkBlue", dim = 3) +
  labs(
    x = NULL,
    y = NULL,
    title = "Tract-wise NJ Median Household Income Against Median Household Rental"
  ) +
  # theme_void() +
  theme(
    plot.title = element_text(hjust = 0.5)
  )
map

Create the legend

legend <- bi_legend(
  pal = "DkBlue",
  dim = 3,
  xlab = "Median income",
  ylab = "Median rental",
  size = 8
)
legend

Combine the map with the legend to get the final plot

ggdraw() +
  draw_plot(map, 0, 0, 1, 1) +
  draw_plot(legend, 0.7, 0.15, 0.3, 0.3)

Based on the above graph, most counties in the center Jersey and north Jersey are blue which means that people in these areas have a high income. Meanwhile, some counties in the south Jersey are in purple which means that people lived there may have a relatively high rental.

2. Fit a linear regression equation of rental against median household income and report the summary.

Fit a linear model to the data and show the summary of the model

lm <- nj_median_household_income_and_rental %>%
  lm(estimate_medrental ~ estimate_medincome, .)
lm %>%
  tidy() %>%
  unnest()
## Warning: `cols` is now required when using unnest().
## Please use `cols = c()`
## # A tibble: 2 × 5
##   term                estimate std.error statistic   p.value
##   <chr>                  <dbl>     <dbl>     <dbl>     <dbl>
## 1 (Intercept)        833.      20.0           41.8 3.54e-275
## 2 estimate_medincome   0.00756  0.000203      37.2 2.98e-231
lm %>%
  glance
## # A tibble: 1 × 12
##   r.squared adj.r.squared sigma statistic   p.value    df  logLik    AIC    BIC
##       <dbl>         <dbl> <dbl>     <dbl>     <dbl> <dbl>   <dbl>  <dbl>  <dbl>
## 1     0.407         0.407  380.     1385. 2.98e-231     1 -14872. 29749. 29766.
## # … with 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>

Visualize the linear model

nj_median_household_income_and_rental %>%
  add_predictions(lm) %>%
  ggplot() +
  geom_point(aes(estimate_medincome, estimate_medrental)) +
  geom_line(aes(estimate_medincome, pred), color = 'red')
## Warning: Removed 160 rows containing missing values (geom_point).
## Warning: Removed 23 row(s) containing missing values (geom_path).

3. Looking at the plot, suggest ways to improve the model fit. Fit the improved model and report the R2.

Fit a log10 model to the data and show the summary of the model

nls <- nj_median_household_income_and_rental %>%
  lm(
    estimate_medrental ~ log(estimate_medincome, 10),
    .,
  )
nls %>%
  tidy %>%
  unnest()
## Warning: `cols` is now required when using unnest().
## Please use `cols = c()`
## # A tibble: 2 × 5
##   term                        estimate std.error statistic   p.value
##   <chr>                          <dbl>     <dbl>     <dbl>     <dbl>
## 1 (Intercept)                   -5523.     197.      -28.0 6.54e-146
## 2 log(estimate_medincome, 10)    1434.      40.2      35.6 2.55e-216
nls %>%
  glance
## # A tibble: 1 × 12
##   r.squared adj.r.squared sigma statistic   p.value    df  logLik    AIC    BIC
##       <dbl>         <dbl> <dbl>     <dbl>     <dbl> <dbl>   <dbl>  <dbl>  <dbl>
## 1     0.386         0.386  387.     1271. 2.55e-216     1 -14906. 29818. 29835.
## # … with 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
nj_median_household_income_and_rental %>%
  add_predictions(nls) %>%
  ggplot() +
  geom_point(aes(estimate_medincome, estimate_medrental)) +
  geom_line(aes(estimate_medincome, pred), color = 'red')
## Warning: Removed 160 rows containing missing values (geom_point).
## Warning: Removed 23 row(s) containing missing values (geom_path).

4. Obtain the rental data for year=2020 (5-year aggregate from 2016-2020) and year=2015 (5-year aggregate from 2011-2015), and plot the percentage changes for each county in a column or bar diagram, in increasing or decreasing order of percentage increase.

Get the rental data from 2011-2015

nj_median_rental_2011_2015 <- get_acs(
  geograph = "county",
  variables = c(medrental = "B25064_001"),
  state = "NJ",
  year = 2015,
  geometry = T
) %>%
  as_tibble() %>%
  rename(
    `2011-2015 Rental` = estimate
  )
## Getting data from the 2011-2015 5-year ACS
## Downloading feature geometry from the Census website.  To cache shapefiles for use in future sessions, set `options(tigris_use_cache = TRUE)`.
nj_median_rental_2011_2015
## # A tibble: 21 × 6
##    GEOID NAME          variable `2011-2015 Ren…`   moe                  geometry
##    <chr> <chr>         <chr>               <dbl> <dbl>        <MULTIPOLYGON [°]>
##  1 34007 Camden Count… medrent…              978    10 (((-75.13598 39.8932, -7…
##  2 34013 Essex County… medrent…             1068     8 (((-74.37623 40.76275, -…
##  3 34019 Hunterdon Co… medrent…             1328    36 (((-75.19511 40.57969, -…
##  4 34023 Middlesex Co… medrent…             1299    13 (((-74.63 40.34341, -74.…
##  5 34041 Warren Count… medrent…             1013    29 (((-75.20392 40.6915, -7…
##  6 34001 Atlantic Cou… medrent…             1047    21 (((-74.98522 39.5148, -7…
##  7 34015 Gloucester C… medrent…             1072    25 (((-75.4383 39.7844, -75…
##  8 34021 Mercer Count… medrent…             1132    14 (((-74.94228 40.34089, -…
##  9 34029 Ocean County… medrent…             1322    21 (((-74.55311 40.07913, -…
## 10 34031 Passaic Coun… medrent…             1184    12 (((-74.50288 41.08592, -…
## # … with 11 more rows

Get the rental data from 2016-2020

nj_median_rental_2016_2020 <- get_acs(
  geograph = "county",
  variables = c(medrental = "B25064_001"),
  state = "NJ",
  year = 2020,
  geometry = T
) %>%
  as_tibble() %>%
  rename(
    `2016-2020 Rental` = estimate
  )
## Getting data from the 2016-2020 5-year ACS
## Downloading feature geometry from the Census website.  To cache shapefiles for use in future sessions, set `options(tigris_use_cache = TRUE)`.
nj_median_rental_2016_2020
## # A tibble: 21 × 6
##    GEOID NAME          variable `2016-2020 Ren…`   moe                  geometry
##    <chr> <chr>         <chr>               <dbl> <dbl>        <MULTIPOLYGON [°]>
##  1 34041 Warren Count… medrent…             1128    29 (((-75.20392 40.6915, -7…
##  2 34033 Salem County… medrent…             1024    41 (((-75.41272 39.38437, -…
##  3 34005 Burlington C… medrent…             1388    21 (((-75.05965 39.99215, -…
##  4 34021 Mercer Count… medrent…             1311    23 (((-74.94295 40.34164, -…
##  5 34025 Monmouth Cou… medrent…             1437    24 (((-74.61353 40.18267, -…
##  6 34017 Hudson Count… medrent…             1450    14 (((-74.0422 40.69997, -7…
##  7 34031 Passaic Coun… medrent…             1310    13 (((-74.50321 41.08587, -…
##  8 34027 Morris Count… medrent…             1622    30 (((-74.88923 40.78883, -…
##  9 34013 Essex County… medrent…             1211    11 (((-74.37623 40.76275, -…
## 10 34009 Cape May Cou… medrent…             1176    45 (((-74.97191 38.94058, -…
## # … with 11 more rows

Combine two data sets and compute the percentage change

nj_median_rental <- inner_join(
  nj_median_rental_2011_2015,
  nj_median_rental_2016_2020,
  by = "GEOID"
) %>%
  mutate(
    `Percentage Change` = (`2016-2020 Rental` - `2011-2015 Rental`)/`2011-2015 Rental`,
    County = str_match(NAME.x, "(?<county>\\S*) County*")[, "county"]
  ) %>%
  select(
    County,
    `2011-2015 Rental`,
    `2016-2020 Rental`,
    `Percentage Change`
  )
nj_median_rental
## # A tibble: 21 × 4
##    County     `2011-2015 Rental` `2016-2020 Rental` `Percentage Change`
##    <chr>                   <dbl>              <dbl>               <dbl>
##  1 Camden                    978               1107              0.132 
##  2 Essex                    1068               1211              0.134 
##  3 Hunterdon                1328               1443              0.0866
##  4 Middlesex                1299               1495              0.151 
##  5 Warren                   1013               1128              0.114 
##  6 Atlantic                 1047               1129              0.0783
##  7 Gloucester               1072               1258              0.174 
##  8 Mercer                   1132               1311              0.158 
##  9 Ocean                    1322               1459              0.104 
## 10 Passaic                  1184               1310              0.106 
## # … with 11 more rows

Plot the graph

nj_median_rental %>%
  ggplot(
    aes(
      x = reorder(County, -`Percentage Change`),
      y = `Percentage Change`
    )
  ) +
  geom_bar(stat = "identity") +
  labs(
    x = "County",
    y = "Percentage Change",
    title = "The Percentage Change of Each County between 2011-2015 to 2016-2020"
  ) +
  theme(plot.title = element_text(hjust = 0.5))